{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34e66a0e-e41d-48e0-8a1f-b82b5ea18ad1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup OpenAI Agent\n",
    "import openai\n",
    "\n",
    "openai.api_key = \"sk-your-key\"\n",
    "from llama_index.agent import OpenAIAgent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb11c1a6-1540-4538-8d1a-bb8b265fdb64",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== Calling Function ===\n",
      "Calling function: process_image with args: {\n",
      "  \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png\",\n",
      "  \"features\": [\"caption\", \"tags\"]\n",
      "}\n",
      "Got output: {'captionResult': {'text': 'a group of cows grazing in a field', 'confidence': 0.861102819442749}, 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 375, 'height': 250}, 'tagsResult': {'values': [{'name': 'grass', 'confidence': 0.9988070130348206}, {'name': 'outdoor', 'confidence': 0.9931809306144714}, {'name': 'field', 'confidence': 0.9857797622680664}, {'name': 'animal', 'confidence': 0.9708199501037598}, {'name': 'livestock', 'confidence': 0.965355634689331}, {'name': 'cow', 'confidence': 0.954204797744751}, {'name': 'herd', 'confidence': 0.9496941566467285}, {'name': 'ranch', 'confidence': 0.9301772117614746}, {'name': 'mammal', 'confidence': 0.9299676418304443}, {'name': 'dairy cow', 'confidence': 0.9291023015975952}, {'name': 'bovine', 'confidence': 0.9199285507202148}, {'name': 'herding', 'confidence': 0.8967740535736084}, {'name': 'fodder', 'confidence': 0.8817697763442993}, {'name': 'grassland', 'confidence': 0.8811800479888916}, {'name': 'standing', 'confidence': 0.8034635782241821}, {'name': 'pasture', 'confidence': 0.6391813158988953}, {'name': 'grazing', 'confidence': 0.6333702802658081}, {'name': 'farm', 'confidence': 0.6285721063613892}, {'name': 'cattle', 'confidence': 0.5256974697113037}, {'name': 'landscape', 'confidence': 0.4293440878391266}]}}\n",
      "========================\n",
      "The caption for the image is \"a group of cows grazing in a field\". \n",
      "\n",
      "The tags in the image include: grass, outdoor, field, animal, livestock, cow, herd, ranch, mammal, dairy cow, bovine, herding, fodder, grassland, standing, pasture, grazing, farm, cattle, and landscape.\n"
     ]
    }
   ],
   "source": [
    "from llama_index.tools.azure_cv.base import AzureCVToolSpec\n",
    "\n",
    "cv_tool = AzureCVToolSpec(api_key=\"your-key\", resource=\"your-resource\")\n",
    "\n",
    "agent = OpenAIAgent.from_tools(\n",
    "    cv_tool.to_tool_list(),\n",
    "    verbose=True,\n",
    ")\n",
    "\n",
    "print(\n",
    "    agent.chat(\n",
    "        \"caption this image and tell me what tags are in it\"\n",
    "        \" https://portal.vision.cognitive.azure.com/dist/assets/ImageCaptioningSample1-bbe41ac5.png\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6be81e1-41a6-48b6-920b-b225c0f16a9b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== Calling Function ===\n",
      "Calling function: process_image with args: {\n",
      "  \"url\": \"https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg\",\n",
      "  \"features\": [\"caption\", \"read\"]\n",
      "}\n",
      "Got output: {'captionResult': {'text': 'close-up of a nutrition label', 'confidence': 0.822258710861206}, 'readResult': 'Nutrition Facts Amount Per Serving\\nServing size: 1 bar (40g)\\nServing Per Package: 4\\nTotal Fat 13g\\nSaturated Fat 1.5g\\nAmount Per Serving\\nTrans Fat 0g\\ncalories 190\\nCholesterol 0mg\\nories from Fat 110\\nSodium 20mg\\nnt Daily Values are based on\\nVitamin A 50\\ncalorie diet', 'modelVersion': '2023-02-01-preview', 'metadata': {'width': 1254, 'height': 704}}\n",
      "========================\n",
      "The caption for the image is \"close-up of a nutrition label\".\n",
      "\n",
      "The text from the image is as follows:\n",
      "\n",
      "\"Nutrition Facts Amount Per Serving\n",
      "Serving size: 1 bar (40g)\n",
      "Serving Per Package: 4\n",
      "Total Fat 13g\n",
      "Saturated Fat 1.5g\n",
      "Amount Per Serving\n",
      "Trans Fat 0g\n",
      "calories 190\n",
      "Cholesterol 0mg\n",
      "ories from Fat 110\n",
      "Sodium 20mg\n",
      "nt Daily Values are based on\n",
      "Vitamin A 50\n",
      "calorie diet\"\n"
     ]
    }
   ],
   "source": [
    "print(\n",
    "    agent.chat(\n",
    "        \"caption this image and read any text\"\n",
    "        \" https://portal.vision.cognitive.azure.com/dist/assets/OCR3-4782f088.jpg\"\n",
    "    )\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
